import numpy as np
import pandas as pd
attendance = pd.read_csv('attendance.csv')
attendance.shape #Attendance is required
(964, 4)
matches = pd.read_csv('matches.csv')
matches.shape #all columns required
(964, 37)
stadiums = pd.read_csv('stadiums.csv')
stadiums.shape #stadium_capacity is required
(193, 8)
attendance.columns
Index(['home_team', 'away_team', 'Attendance', 'Date'], dtype='object')
matches.columns
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
'city_name', 'country_name', 'home_team_id', 'home_team_name',
'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
'draw'],
dtype='object')
stadiums.columns
Index(['key_id', 'stadium_id', 'stadium_name', 'city_name', 'country_name',
'stadium_capacity', 'stadium_wikipedia_link', 'city_wikipedia_link'],
dtype='object')
attendance = attendance.rename(columns={"home_team": "home_team_name", "away_team": "away_team_name","Date":"match_date"})
attendance.columns
Index(['home_team_name', 'away_team_name', 'Attendance', 'match_date'], dtype='object')
df = pd.merge(matches, attendance, how='inner', on=["match_date", "home_team_name","away_team_name"])
df.shape
(850, 38)
teams1 = attendance.home_team_name.unique()
teams2 = matches.home_team_name.unique()
teams3 = [x for x in teams1 if x not in teams2]
teams3
['Korea Republic', 'IR Iran', "Côte d'Ivoire", 'Korea DPR', 'Türkiye', 'China PR', 'FR Yugoslavia', 'Germany DR']
teams4 = [x for x in teams2 if x not in teams1]
teams4
['Turkey', 'North Korea', 'East Germany', 'South Korea', 'China', 'Iran', 'Ivory Coast']
rep = {'Korea Republic':'South Korea','Korea DPR':'North Korea','Côte d\'Ivoire' : 'Ivory Coast','Germany DR' : 'East Germany','IR Iran' : 'Iran', 'China PR' : 'China','FR Yugoslavia' : 'Yugoslavia','Türkiye' : 'Turkey'}
Korea Republic = South Korea, Korea DPR = North Korea, Côte d'Ivoire = Ivory Coast, Germany DR = East Germany, IR Iran = Iran, China PR = China, FR Yugoslavia = Yugoslavia, Türkiye = Turkey
attendance['home_team_name'] = attendance['home_team_name'].replace(rep)
attendance['away_team_name'] = attendance['away_team_name'].replace(rep)
df = pd.merge(matches, attendance, how='outer', on=["match_date", "home_team_name","away_team_name"])
df.shape
(991, 38)
teams = pd.read_csv('teams.csv')
teams_dict = pd.Series(teams.team_code.values,index=teams.team_name).to_dict()
attendance['home_team_name'] = attendance['home_team_name'].replace(teams_dict)
attendance['away_team_name'] = attendance['away_team_name'].replace(teams_dict)
attendance = attendance.rename(columns={"home_team_name": "home_team_code", "away_team_name": "away_team_code"})
df = pd.merge(matches, attendance, how='inner', on=["match_date", "home_team_code","away_team_code"])
df.shape
(943, 38)
attendance = attendance.rename(columns={"home_team_code": "away_team_code", "away_team_code": "home_team_code"})
df2 = pd.merge(matches, attendance, how='inner', on=["match_date", "home_team_code","away_team_code"])
df2.shape
(21, 38)
df = pd.merge(df, df2, how='outer')
df = df.sort_values(by=['match_id'])
df.shape
(964, 38)
df = df.dropna()
df.shape
(964, 38)
matches = pd.merge(df, stadiums[['stadium_id','stadium_capacity']], how='inner', on=["stadium_id"])
matches.shape
(964, 39)
matches.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | penalty_shootout | score_penalties | home_team_score_penalties | away_team_score_penalties | result | home_team_win | away_team_win | draw | Attendance | stadium_capacity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 4444 | 10000 |
| 1 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 2549 | 10000 |
| 2 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 18346 | 20000 |
| 3 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 24059 | 20000 |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 23409 | 20000 |
5 rows × 39 columns
players = pd.read_csv('players.csv')
players.shape
(8485, 12)
squads = pd.read_csv('squads.csv')
squads.shape
(10973, 12)
players.columns #player_id, given_name, family_name, count_tournaments, list_tournaments
Index(['key_id', 'player_id', 'family_name', 'given_name', 'birth_date',
'goal_keeper', 'defender', 'midfielder', 'forward', 'count_tournaments',
'list_tournaments', 'player_wikipedia_link'],
dtype='object')
squads.columns #team_name, team_code, count_team
Index(['key_id', 'tournament_id', 'tournament_name', 'team_id', 'team_name',
'team_code', 'player_id', 'family_name', 'given_name', 'shirt_number',
'position_name', 'position_code'],
dtype='object')
team_names = squads.groupby('player_id').agg({'team_name': lambda x: list(set(x))}).reset_index().rename(columns={"team_name": "list_team_name"})
team_codes = squads.groupby('player_id').agg({'team_code': lambda x: list(set(x))}).reset_index().rename(columns={"team_code": "list_team_code"})
count_team = squads.groupby('player_id').agg({'team_code': lambda x: len(set(x))}).reset_index().rename(columns={"team_code": "count_team"})
players_teams = pd.merge(players[['player_id','given_name','family_name', 'count_tournaments', 'list_tournaments']], team_names[['player_id','list_team_name']], how='outer', on=["player_id"])
players_teams = pd.merge(players_teams, team_codes[['player_id','list_team_code']], how='outer', on=["player_id"])
players_teams = pd.merge(players_teams, count_team[['player_id','count_team']], how='outer', on=["player_id"])
players_teams.head()
| player_id | given_name | family_name | count_tournaments | list_tournaments | list_team_name | list_team_code | count_team | |
|---|---|---|---|---|---|---|---|---|
| 0 | P-08891 | Alan | A'Court | 1 | 1958 | [England] | [ENG] | 1 |
| 1 | P-08589 | Brenden | Aaronson | 1 | 2022 | [United States] | [USA] | 1 |
| 2 | P-04897 | Stefan | Abadzhiev | 1 | 1966 | [Bulgaria] | [BGR] | 1 |
| 3 | P-05556 | Jean-Paul | Abalo | 1 | 2006 | [Togo] | [TGO] | 1 |
| 4 | P-08163 | Patrice | Abanda | 1 | 1998 | [Cameroon] | [CMR] | 1 |
players_teams.shape
(8485, 8)
matches['total_goals_in_match'] = matches['home_team_score'] + matches['away_team_score']
matches[['home_team_score','away_team_score','total_goals_in_match']].head()
| home_team_score | away_team_score | total_goals_in_match | |
|---|---|---|---|
| 0 | 4 | 1 | 5 |
| 1 | 3 | 1 | 4 |
| 2 | 3 | 0 | 3 |
| 3 | 2 | 1 | 3 |
| 4 | 1 | 0 | 1 |
matches.columns
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
'city_name', 'country_name', 'home_team_id', 'home_team_name',
'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
'draw', 'Attendance', 'stadium_capacity', 'total_goals_in_match'],
dtype='object')
matches['match_for_host'] = matches.apply(lambda x: True if teams_dict[x['country_name']] == x['home_team_code'] or teams_dict[x['country_name']] == x['away_team_code'] else False, axis= 1)
matches[['home_team_name','away_team_name','country_name','match_for_host']].head()
| home_team_name | away_team_name | country_name | match_for_host | |
|---|---|---|---|---|
| 0 | France | Mexico | Uruguay | False |
| 1 | Romania | Peru | Uruguay | False |
| 2 | United States | Belgium | Uruguay | False |
| 3 | Yugoslavia | Brazil | Uruguay | False |
| 4 | Argentina | France | Uruguay | False |
matches['used_capacity_ratio'] = matches.apply(lambda x: x['Attendance']/x['stadium_capacity'], axis= 1)
matches[['match_name','used_capacity_ratio']].head()
| match_name | used_capacity_ratio | |
|---|---|---|
| 0 | France v Mexico | 0.44440 |
| 1 | Romania v Peru | 0.25490 |
| 2 | United States v Belgium | 0.91730 |
| 3 | Yugoslavia v Brazil | 1.20295 |
| 4 | Argentina v France | 1.17045 |
matches['Attendance'].min()
2000
matches['Attendance'].max()
173850
matches['Attendance'].mean()
45693.3744813278
matches['attendance_category'] = pd.cut(x=matches['Attendance'], bins=[2000, 30000, 50000, 180000], labels=['Low', 'Medium', 'High'])
matches[['match_name','Attendance','attendance_category']].head()
| match_name | Attendance | attendance_category | |
|---|---|---|---|
| 0 | France v Mexico | 4444 | Low |
| 1 | Romania v Peru | 2549 | Low |
| 2 | United States v Belgium | 18346 | Low |
| 3 | Yugoslavia v Brazil | 24059 | Low |
| 4 | Argentina v France | 23409 | Low |
matches['used_capacity_ratio'].min()
0.022222222222222223
matches['used_capacity_ratio'].max()
2.1129032258064515
matches['used_capacity_ratio'].mean()
0.8311335805127023
matches['relative_attendance_category'] = pd.cut(x=matches['used_capacity_ratio'], bins=[0.02, 0.8, 1, 2.2], labels=['Low', 'Medium', 'High'])
matches[['match_name','Attendance','relative_attendance_category']].head()
| match_name | Attendance | relative_attendance_category | |
|---|---|---|---|
| 0 | France v Mexico | 4444 | Low |
| 1 | Romania v Peru | 2549 | Low |
| 2 | United States v Belgium | 18346 | Medium |
| 3 | Yugoslavia v Brazil | 24059 | High |
| 4 | Argentina v France | 23409 | High |
matches['host_country_code'] = matches.apply(lambda x: teams_dict[x['country_name']] , axis= 1)
matches[['match_name','country_name','host_country_code']].head()
| match_name | country_name | host_country_code | |
|---|---|---|---|
| 0 | France v Mexico | Uruguay | URY |
| 1 | Romania v Peru | Uruguay | URY |
| 2 | United States v Belgium | Uruguay | URY |
| 3 | Yugoslavia v Brazil | Uruguay | URY |
| 4 | Argentina v France | Uruguay | URY |
tournaments = pd.read_csv('tournaments.csv')
tournaments.shape
(22, 18)
tournaments.columns
Index(['key_id', 'tournament_id', 'tournament_name', 'year', 'start_date',
'end_date', 'host_country', 'winner', 'host_won', 'count_teams',
'group_stage', 'second_group_stage', 'final_round', 'round_of_16',
'quarter_finals', 'semi_finals', 'third_place_match', 'final'],
dtype='object')
matches = pd.merge(matches, tournaments[["tournament_id", "year"]], how='inner', on=["tournament_id"]).rename(columns={"year": "tournament_year"})
matches.shape
(964, 46)
matches[['match_name','country_name','host_country_code', 'tournament_year']].head()
| match_name | country_name | host_country_code | tournament_year | |
|---|---|---|---|---|
| 0 | France v Mexico | Uruguay | URY | 1930 |
| 1 | Romania v Peru | Uruguay | URY | 1930 |
| 2 | United States v Belgium | Uruguay | URY | 1930 |
| 3 | Yugoslavia v Brazil | Uruguay | URY | 1930 |
| 4 | Argentina v France | Uruguay | URY | 1930 |
players_teams.columns
Index(['player_id', 'given_name', 'family_name', 'count_tournaments',
'list_tournaments', 'list_team_name', 'list_team_code', 'count_team'],
dtype='object')
def get_full_name(x):
if x['given_name'] == 'not applicable':
return x['family_name']
elif x['family_name'] == 'not applicable':
return x['given_name']
else:
return x['given_name']+' '+x['family_name']
players_teams['full_name'] = players_teams.apply(get_full_name, axis= 1)
players_teams[['given_name', 'family_name', 'full_name']].head()
| given_name | family_name | full_name | |
|---|---|---|---|
| 0 | Alan | A'Court | Alan A'Court |
| 1 | Brenden | Aaronson | Brenden Aaronson |
| 2 | Stefan | Abadzhiev | Stefan Abadzhiev |
| 3 | Jean-Paul | Abalo | Jean-Paul Abalo |
| 4 | Patrice | Abanda | Patrice Abanda |
matches.columns
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
'city_name', 'country_name', 'home_team_id', 'home_team_name',
'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
'draw', 'Attendance', 'stadium_capacity', 'total_goals_in_match',
'match_for_host', 'used_capacity_ratio', 'attendance_category',
'relative_attendance_category', 'host_country_code', 'tournament_year'],
dtype='object')
matches['short_stage_name'] = matches.apply(lambda x: 'group_stage' if x['group_stage'] == 1 else 'knockout_stage' , axis= 1)
matches[['match_name','country_name','host_country_code', 'tournament_year','short_stage_name']].head()
| match_name | country_name | host_country_code | tournament_year | short_stage_name | |
|---|---|---|---|---|---|
| 0 | France v Mexico | Uruguay | URY | 1930 | group_stage |
| 1 | Romania v Peru | Uruguay | URY | 1930 | group_stage |
| 2 | United States v Belgium | Uruguay | URY | 1930 | group_stage |
| 3 | Yugoslavia v Brazil | Uruguay | URY | 1930 | group_stage |
| 4 | Argentina v France | Uruguay | URY | 1930 | group_stage |
tournaments.columns
Index(['key_id', 'tournament_id', 'tournament_name', 'year', 'start_date',
'end_date', 'host_country', 'winner', 'host_won', 'count_teams',
'group_stage', 'second_group_stage', 'final_round', 'round_of_16',
'quarter_finals', 'semi_finals', 'third_place_match', 'final'],
dtype='object')
tournaments['winner_code'] = tournaments.apply(lambda x: teams_dict[x['winner']], axis= 1)
tournaments[['tournament_id', 'tournament_name', 'year', 'winner','winner_code']].head()
| tournament_id | tournament_name | year | winner | winner_code | |
|---|---|---|---|---|---|
| 0 | WC-1930 | 1930 FIFA World Cup | 1930 | Uruguay | URY |
| 1 | WC-1934 | 1934 FIFA World Cup | 1934 | Italy | ITA |
| 2 | WC-1938 | 1938 FIFA World Cup | 1938 | Italy | ITA |
| 3 | WC-1950 | 1950 FIFA World Cup | 1950 | Uruguay | URY |
| 4 | WC-1954 | 1954 FIFA World Cup | 1954 | West Germany | DEU |
goals = pd.read_csv('goals.csv')
goals[['minute_label', 'minute_regulation', 'minute_stoppage', 'match_period']].head()
| minute_label | minute_regulation | minute_stoppage | match_period | |
|---|---|---|---|---|
| 0 | 19' | 19 | 0 | first half |
| 1 | 40' | 40 | 0 | first half |
| 2 | 43' | 43 | 0 | first half |
| 3 | 70' | 70 | 0 | second half |
| 4 | 87' | 87 | 0 | second half |
goals.match_period.unique()
array(['first half', 'second half', 'extra time, first half',
'extra time, second half', 'second half, stoppage time',
'first half, stoppage time',
'extra time, second half, stoppage time',
'extra time, first half, stoppage time'], dtype=object)
def check_late_goal(data):
minute = data['minute_regulation']
match_period = data['match_period']
if (((minute > 40) and (match_period in ['first half', 'first half, stoppage time'])) or
((minute > 85) and (match_period in ['second half', 'second half, stoppage time'])) or
((minute > 100) and (match_period in ['extra time, first half', 'extra time, first half, stoppage time'])) or
((minute > 115) and (match_period in ['extra time, second half', 'extra time, second half, stoppage time']))):
return True
return False
goals['late_goal'] = goals.apply(check_late_goal, axis= 1)
goals.head()
| key_id | goal_id | tournament_id | tournament_name | match_id | match_name | match_date | stage_name | group_name | team_id | ... | player_team_id | player_team_name | player_team_code | minute_label | minute_regulation | minute_stoppage | match_period | own_goal | penalty | late_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | G-0001 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 19' | 19 | 0 | first half | 0 | 0 | False |
| 1 | 2 | G-0002 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 40' | 40 | 0 | first half | 0 | 0 | False |
| 2 | 3 | G-0003 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 43' | 43 | 0 | first half | 0 | 0 | True |
| 3 | 4 | G-0004 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-44 | ... | T-44 | Mexico | MEX | 70' | 70 | 0 | second half | 0 | 0 | False |
| 4 | 5 | G-0005 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 87' | 87 | 0 | second half | 0 | 0 | True |
5 rows × 28 columns
import plotly.express as px
import plotly.graph_objects as go
result = matches.groupby(['tournament_year'])['Attendance'].agg(['median', 'mean']).reset_index()
fig = px.line(result,x ='tournament_year', y = ['mean','median'])
fig.update_layout(
xaxis=dict(
tickmode='array', # Set the tick mode to 'array'
tickvals=result['tournament_year'],
ticktext=result['tournament_year'],
tickangle=90,
tickfont=dict(size=14)
)
)
fig.show()
result = matches.groupby(['tournament_year'])['used_capacity_ratio'].agg(['median', 'mean']).reset_index()
fig = px.line(result,x ='tournament_year', y = ['mean','median'])
fig.update_layout(
xaxis=dict(
tickmode='array', # Set the tick mode to 'array'
tickvals=result['tournament_year'],
ticktext=result['tournament_year'],
tickangle=90,
tickfont=dict(size=14)
)
)
fig.show()
result = matches.groupby(['country_name','tournament_year'])['Attendance'].agg(['median', 'mean']).reset_index()
fig = px.line(result,x ='country_name', y = ['mean','median'])
fig.show()
result = matches.groupby(['country_name','tournament_year'])['used_capacity_ratio'].agg(['median', 'mean']).reset_index()
fig = px.line(result,x ='country_name', y = ['mean','median'])
fig.show()
fig = px.histogram(matches, x="Attendance", nbins=19)
fig.update_layout(bargap = 0.1)
fig.show()
fig = px.histogram(matches, x="used_capacity_ratio", nbins=11)
fig.show()
result = matches.sort_values(by = 'used_capacity_ratio', ascending = False)
result[['tournament_year','home_team_name','away_team_name','country_name','used_capacity_ratio','total_goals_in_match']].head(1)
| tournament_year | home_team_name | away_team_name | country_name | used_capacity_ratio | total_goals_in_match | |
|---|---|---|---|---|---|---|
| 432 | 1986 | Canada | France | Mexico | 2.112903 | 1 |
fig = go.Figure()
for date in matches['tournament_year'].unique():
fig.add_trace(go.Box(
x=matches[matches['tournament_year'] == date]['tournament_year'],
y=matches[matches['tournament_year'] == date]['Attendance'],
name=str(date)
))
fig.update_layout(
title='Attendance Distribution Over Years',
xaxis=dict(title='tournament_year'),
yaxis=dict(title='Attendance')
)
fig.show()
fig = go.Figure()
# Iterate over each year and add a box plot trace
for date in matches['tournament_year'].unique():
fig.add_trace(go.Box(
x=matches[matches['tournament_year'] == date]['tournament_year'],
y=matches[matches['tournament_year'] == date]['used_capacity_ratio'],
name=str(date)
))
# Customize the layout
fig.update_layout(
title='used_capacity_ratio Distribution Over Years',
xaxis=dict(title='tournament_year'),
yaxis=dict(title='used_capacity_ratio')
)
# Display the chart
fig.show()
result = goals.groupby(['tournament_id'])['minute_regulation'].agg(['median', 'mean']).reset_index()
fig = px.bar(result,x ='tournament_id', y = ['mean'])
fig.update_layout(
xaxis=dict(
tickmode='array', # Set the tick mode to 'array'
tickvals=result['tournament_id'],
ticktext=result['tournament_id'],
tickangle=45,
tickfont=dict(size=14)
)
)
fig.show()
matches['total_goals_in_match'].max()
12
matches['total_goals_in_match'].min()
0
fig = px.histogram(matches, x="total_goals_in_match",nbins=13)
fig.update_layout(bargap=0.1)
fig.show()
result = goals.groupby('tournament_id').agg({'minute_regulation': pd.Series.mode, 'match_period': pd.Series.mode})
result
| minute_regulation | match_period | |
|---|---|---|
| tournament_id | ||
| WC-1930 | 65 | second half |
| WC-1934 | [18, 29] | [first half, second half] |
| WC-1938 | [35, 44, 89] | first half |
| WC-1950 | 17 | second half |
| WC-1954 | [18, 30, 54, 60, 69, 78, 84, 85] | second half |
| WC-1958 | [4, 18, 24, 32, 44, 52, 55] | second half |
| WC-1962 | [17, 29, 56, 73, 90] | second half |
| WC-1966 | [15, 43, 75, 88] | second half |
| WC-1970 | 76 | second half |
| WC-1974 | 18 | second half |
| WC-1978 | [43, 45] | first half |
| WC-1982 | [68, 75, 83] | second half |
| WC-1986 | 62 | second half |
| WC-1990 | 81 | second half |
| WC-1994 | 90 | second half |
| WC-1998 | 90 | second half |
| WC-2002 | 45 | second half |
| WC-2006 | 90 | first half |
| WC-2010 | 90 | second half |
| WC-2014 | 90 | second half |
| WC-2018 | 90 | second half |
| WC-2022 | 90 | second half |
result = goals.groupby(["tournament_id"])['late_goal'].agg(pd.Series.sum).reset_index().rename(columns={"late_goal": "late_goals_in_tournament"})
result = result.sort_values(by = 'tournament_id', ascending = True)
result
| tournament_id | late_goals_in_tournament | |
|---|---|---|
| 0 | WC-1930 | 8 |
| 1 | WC-1934 | 8 |
| 2 | WC-1938 | 17 |
| 3 | WC-1950 | 7 |
| 4 | WC-1954 | 12 |
| 5 | WC-1958 | 14 |
| 6 | WC-1962 | 12 |
| 7 | WC-1966 | 15 |
| 8 | WC-1970 | 12 |
| 9 | WC-1974 | 12 |
| 10 | WC-1978 | 19 |
| 11 | WC-1982 | 10 |
| 12 | WC-1986 | 16 |
| 13 | WC-1990 | 15 |
| 14 | WC-1994 | 30 |
| 15 | WC-1998 | 35 |
| 16 | WC-2002 | 28 |
| 17 | WC-2006 | 29 |
| 18 | WC-2010 | 21 |
| 19 | WC-2014 | 31 |
| 20 | WC-2018 | 38 |
| 21 | WC-2022 | 39 |
fig = px.histogram(result, x="late_goals_in_tournament", nbins=20)
fig.show()
players_goals = goals.groupby(["player_id"])['key_id'].agg(['count']).reset_index().rename(columns={"count": "goals_number"})
players_goals = pd.merge(players_goals, players_teams[['player_id','full_name','list_team_name']], how='inner', on=["player_id"])
players_goals.head()
| player_id | goals_number | full_name | list_team_name | |
|---|---|---|---|---|
| 0 | P-00004 | 1 | Julio César Cortés | [Uruguay] |
| 1 | P-00010 | 1 | Michel De Wolf | [Belgium] |
| 2 | P-00013 | 2 | Vincenzo Iaquinta | [Italy] |
| 3 | P-00021 | 5 | Hans Krankl | [Austria] |
| 4 | P-00032 | 1 | Antonio Rada | [Colombia] |
player_appearances = pd.read_csv('player_appearances.csv')
player_matches = player_appearances.groupby(['player_id'])['match_id'].agg(['count']).reset_index()
player_matches.head()
| player_id | count | |
|---|---|---|
| 0 | P-00001 | 14 |
| 1 | P-00003 | 3 |
| 2 | P-00004 | 6 |
| 3 | P-00005 | 4 |
| 4 | P-00009 | 1 |
result = pd.merge(players_goals, player_matches, how='inner', on=["player_id"])
result['goals_matches'] = result['goals_number']/result['count']
result = result.sort_values(by = ['goals_number','count'], ascending = [False,True])
result.head()
| player_id | goals_number | full_name | list_team_name | count | goals_matches | |
|---|---|---|---|---|---|---|
| 585 | P-05224 | 16 | Miroslav Klose | [Germany] | 24 | 0.666667 |
| 957 | P-08490 | 15 | Ronaldo | [Brazil] | 19 | 0.789474 |
| 246 | P-02173 | 14 | Gerd Müller | [West Germany] | 13 | 1.076923 |
| 390 | P-03429 | 13 | Lionel Messi | [Argentina] | 26 | 0.500000 |
| 1048 | P-09516 | 12 | Pelé | [Brazil] | 6 | 2.000000 |
best_players = result.head(12)
best_players
| player_id | goals_number | full_name | list_team_name | count | goals_matches | |
|---|---|---|---|---|---|---|
| 585 | P-05224 | 16 | Miroslav Klose | [Germany] | 24 | 0.666667 |
| 957 | P-08490 | 15 | Ronaldo | [Brazil] | 19 | 0.789474 |
| 246 | P-02173 | 14 | Gerd Müller | [West Germany] | 13 | 1.076923 |
| 390 | P-03429 | 13 | Lionel Messi | [Argentina] | 26 | 0.500000 |
| 1048 | P-09516 | 12 | Pelé | [Brazil] | 6 | 2.000000 |
| 800 | P-06978 | 12 | Kylian Mbappé | [France] | 14 | 0.857143 |
| 747 | P-06604 | 11 | Jürgen Klinsmann | [Germany, West Germany] | 17 | 0.647059 |
| 41 | P-00324 | 10 | Gary Lineker | [England] | 12 | 0.833333 |
| 666 | P-05966 | 10 | Gabriel Batistuta | [Argentina] | 12 | 0.833333 |
| 200 | P-01728 | 10 | Teófilo Cubillas | [Peru] | 13 | 0.769231 |
| 576 | P-05090 | 10 | Thomas Müller | [Germany] | 19 | 0.526316 |
| 288 | P-02554 | 10 | Grzegorz Lato | [Poland] | 20 | 0.500000 |
fig = px.bar(best_players,x ='full_name', y = 'goals_number')
fig.show()
best = goals.groupby(['tournament_id','player_id'])['key_id'].count().reset_index().rename(columns={"key_id": "goals_number"})
best.head()
| tournament_id | player_id | goals_number | |
|---|---|---|---|
| 0 | WC-1930 | P-00033 | 1 |
| 1 | WC-1930 | P-00159 | 1 |
| 2 | WC-1930 | P-00280 | 1 |
| 3 | WC-1930 | P-00780 | 1 |
| 4 | WC-1930 | P-01708 | 8 |
best = best.sort_values(by = 'goals_number', ascending = False).drop_duplicates('tournament_id')
best
| tournament_id | player_id | goals_number | |
|---|---|---|---|
| 250 | WC-1958 | P-02537 | 13 |
| 217 | WC-1954 | P-06982 | 11 |
| 407 | WC-1970 | P-02173 | 10 |
| 367 | WC-1966 | P-04623 | 9 |
| 140 | WC-1950 | P-03149 | 9 |
| 4 | WC-1930 | P-01708 | 8 |
| 1104 | WC-2002 | P-08490 | 8 |
| 1653 | WC-2022 | P-06978 | 8 |
| 461 | WC-1974 | P-02554 | 7 |
| 94 | WC-1938 | P-02781 | 7 |
| 1457 | WC-2018 | P-00376 | 6 |
| 891 | WC-1994 | P-08714 | 6 |
| 921 | WC-1998 | P-01739 | 6 |
| 569 | WC-1982 | P-00907 | 6 |
| 1365 | WC-2014 | P-03168 | 6 |
| 513 | WC-1978 | P-01718 | 6 |
| 667 | WC-1986 | P-00324 | 6 |
| 810 | WC-1990 | P-08408 | 6 |
| 1187 | WC-2006 | P-05224 | 5 |
| 53 | WC-1934 | P-03976 | 5 |
| 1325 | WC-2010 | P-09223 | 5 |
| 312 | WC-1962 | P-03057 | 4 |
best = pd.merge(players_teams[['player_id', 'full_name']], best, how='inner', on=["player_id"])
best
| player_id | full_name | tournament_id | goals_number | |
|---|---|---|---|---|
| 0 | P-03149 | Ademir | WC-1950 | 9 |
| 1 | P-04623 | Eusébio | WC-1966 | 9 |
| 2 | P-02537 | Just Fontaine | WC-1958 | 13 |
| 3 | P-09223 | Diego Forlán | WC-2010 | 5 |
| 4 | P-00376 | Harry Kane | WC-2018 | 6 |
| 5 | P-01718 | Mario Kempes | WC-1978 | 6 |
| 6 | P-05224 | Miroslav Klose | WC-2006 | 5 |
| 7 | P-06982 | Sándor Kocsis | WC-1954 | 11 |
| 8 | P-02554 | Grzegorz Lato | WC-1974 | 7 |
| 9 | P-02781 | Leônidas | WC-1938 | 7 |
| 10 | P-00324 | Gary Lineker | WC-1986 | 6 |
| 11 | P-06978 | Kylian Mbappé | WC-2022 | 8 |
| 12 | P-02173 | Gerd Müller | WC-1970 | 10 |
| 13 | P-03976 | Oldřich Nejedlý | WC-1934 | 5 |
| 14 | P-03168 | James Rodríguez | WC-2014 | 6 |
| 15 | P-08490 | Ronaldo | WC-2002 | 8 |
| 16 | P-00907 | Paolo Rossi | WC-1982 | 6 |
| 17 | P-03057 | Leonel Sánchez | WC-1962 | 4 |
| 18 | P-08408 | Salvatore Schillaci | WC-1990 | 6 |
| 19 | P-01708 | Guillermo Stábile | WC-1930 | 8 |
| 20 | P-08714 | Hristo Stoichkov | WC-1994 | 6 |
| 21 | P-01739 | Davor Šuker | WC-1998 | 6 |
best["tournament_id&player_full_name"] = best["tournament_id"]+': '+best["full_name"]
fig = px.bar(best, x="tournament_id&player_full_name", y="goals_number")
fig.show()
selected_teams = ['BRA', 'DEU', 'ITA']
filtered_goals = goals[goals['team_code'].isin(selected_teams)]
fig = px.strip(filtered_goals, x='stage_name', y='minute_regulation', color='team_code')
fig.update_layout(
title='World Cup Goals',
xaxis_title='Stage',
yaxis_title='Goal Time',
legend_title='Team'
)
fig.show()
temp = matches
temp['team_pair'] = temp[['home_team_code', 'away_team_code']].apply(lambda x: '-'.join(sorted(x)), axis=1)
temp['duplicate_counts'] = temp.groupby('team_pair')['team_pair'].transform('size')
result = temp[['home_team_name', 'away_team_name','team_pair','duplicate_counts']].drop_duplicates('team_pair').reset_index()
#result.team_pair.unique()
result = result.sort_values(by = 'duplicate_counts', ascending = False).head(10)
result
| index | home_team_name | away_team_name | team_pair | duplicate_counts | |
|---|---|---|---|---|---|
| 54 | 59 | West Germany | Argentina | ARG-DEU | 7 |
| 59 | 64 | Brazil | Sweden | BRA-SWE | 7 |
| 43 | 44 | West Germany | Yugoslavia | DEU-YUG | 6 |
| 105 | 116 | Netherlands | Argentina | ARG-NLD | 6 |
| 154 | 169 | Brazil | Mexico | BRA-MEX | 5 |
| 153 | 168 | Argentina | England | ARG-ENG | 5 |
| 106 | 117 | Brazil | Netherlands | BRA-NLD | 5 |
| 218 | 249 | Nigeria | Argentina | ARG-NGA | 5 |
| 93 | 100 | Brazil | Czechoslovakia | BRA-CSK | 5 |
| 28 | 29 | Spain | Brazil | BRA-ESP | 5 |
fig = px.bar(result, x="team_pair", y="duplicate_counts")
fig.show()
players_teams.columns
Index(['player_id', 'given_name', 'family_name', 'count_tournaments',
'list_tournaments', 'list_team_name', 'list_team_code', 'count_team',
'full_name'],
dtype='object')
filtered_players = players_teams[players_teams['count_team'] > 1]
filtered_players[['full_name','list_team_name',]]
| full_name | list_team_name | |
|---|---|---|
| 331 | José Altafini | [Brazil, Italy] |
| 1039 | Alen Bokšić | [Yugoslavia, Croatia] |
| 1087 | Aleksandr Borodyuk | [Russia, Soviet Union] |
| 2037 | Attilio Demaría | [Argentina, Italy] |
| 3036 | Sergei Gorlukovich | [Russia, Soviet Union] |
| 3719 | Robert Jarni | [Yugoslavia, Croatia] |
| 5200 | Savo Milošević | [Serbia and Montenegro, Yugoslavia] |
| 5304 | Luis Monti | [Argentina, Italy] |
| 6319 | Robert Prosinečki | [Yugoslavia, Croatia] |
| 6340 | Ferenc Puskás | [Hungary, Spain] |
| 6396 | Rudolf Raftl | [Germany, Austria] |
| 6903 | José Santamaría | [Spain, Uruguay] |
| 6974 | Willibald Schmaus | [Germany, Austria] |
| 7353 | Dejan Stanković | [Serbia and Montenegro, Yugoslavia, Serbia] |
| 7403 | Vladimir Stojković | [Serbia and Montenegro, Serbia] |
| 7424 | Josef Stroh | [Germany, Austria] |
| 7451 | Davor Šuker | [Yugoslavia, Croatia] |
| 8030 | Nemanja Vidić | [Serbia and Montenegro, Serbia] |
| 8131 | Franz Wagner | [Germany, Austria] |
| 8443 | Nikola Žigić | [Serbia and Montenegro, Serbia] |
import scipy.stats as stats
def calc_association(feature1, feature2):
contingency_table = pd.crosstab(feature1, feature2)
# Calculate the chi-square statistic and degrees of freedom
chi2_statistic, p_value, _, _ = stats.chi2_contingency(contingency_table)
#degrees_of_freedom = (contingency_table.shape[0] - 1) * (contingency_table.shape[1] - 1)
# Calculate Cramer's V
n = contingency_table.sum().sum()
min_dim = min(contingency_table.shape) - 1
cramers_v = np.sqrt(chi2_statistic / (n * min_dim))
if cramers_v < 0.1:
association_strength = 'Weak'
elif cramers_v < 0.3:
association_strength = 'Moderate'
elif cramers_v < 0.5:
association_strength = 'Strong'
else:
association_strength = 'Very strong'
return p_value,cramers_v, association_strength
significance_level = 0.05
p_value, cramers_v, association_strength = calc_association(tournaments['host_country'], tournaments['winner'])
print("p_value: ", p_value)
if p_value < significance_level:
print("There is a significant relationship between the host_country and the winner.")
print("Cramer's V:", cramers_v)
print("Association strength:", association_strength)
else:
print("There is no significant relationship between the host_country and the winner.")
p_value: 0.4133433398768274 There is no significant relationship between the host_country and the winner.
result = matches[['match_for_host','relative_attendance_category']].groupby('match_for_host')['relative_attendance_category'].agg(pd.Series.mode)
result
match_for_host False Medium True Medium Name: relative_attendance_category, dtype: object
p_value, cramers_v, association_strength = calc_association(matches['match_for_host'], matches['relative_attendance_category'])
print("p_value: ", p_value)
if p_value < significance_level:
print("There is a significant relationship between the match_for_host and the relative_attendance_category.")
print("Cramer's V:", cramers_v)
print("Association strength:", association_strength)
else:
print("There is no significant relationship between the match_for_host and the relative_attendance_category.")
p_value: 0.0019101562273923544 There is a significant relationship between the match_for_host and the relative_attendance_category. Cramer's V: 0.113968131290457 Association strength: Moderate
result = matches[['host_country_code','attendance_category']].groupby('host_country_code')['attendance_category'].agg(pd.Series.mode)
result
host_country_code ARG Medium BRA High CHE Low CHL Low DEU High ENG ['Medium', 'High'] Categories (3, object): ['L... ESP Medium FRA Medium ITA Medium JPN Medium KOR Medium MEX Low QAT Medium RUS Medium SWE Low URY Low USA High ZAF Medium Name: attendance_category, dtype: object
p_value, cramers_v, association_strength = calc_association(matches['host_country_code'], matches['attendance_category'])
print("p_value: ", p_value)
if p_value < significance_level:
print("There is a significant relationship between the host_country_code and the attendance_category.")
print("Cramer's V:", cramers_v)
print("Association strength:", association_strength)
else:
print("There is no significant relationship between the host_country_code and the attendance_category.")
p_value: 2.903577845500795e-62 There is a significant relationship between the host_country_code and the attendance_category. Cramer's V: 0.450588376944018 Association strength: Strong
tournaments_winners = tournaments[['tournament_id', 'winner']].rename(columns={"winner": "team_name"})
tournaments_winners
| tournament_id | team_name | |
|---|---|---|
| 0 | WC-1930 | Uruguay |
| 1 | WC-1934 | Italy |
| 2 | WC-1938 | Italy |
| 3 | WC-1950 | Uruguay |
| 4 | WC-1954 | West Germany |
| 5 | WC-1958 | Brazil |
| 6 | WC-1962 | Brazil |
| 7 | WC-1966 | England |
| 8 | WC-1970 | Brazil |
| 9 | WC-1974 | West Germany |
| 10 | WC-1978 | Argentina |
| 11 | WC-1982 | Italy |
| 12 | WC-1986 | Argentina |
| 13 | WC-1990 | West Germany |
| 14 | WC-1994 | Brazil |
| 15 | WC-1998 | France |
| 16 | WC-2002 | Brazil |
| 17 | WC-2006 | Italy |
| 18 | WC-2010 | Spain |
| 19 | WC-2014 | Germany |
| 20 | WC-2018 | France |
| 21 | WC-2022 | Argentina |
player_appearances = pd.read_csv('player_appearances.csv')
tournaments_players = player_appearances[['tournament_id', 'team_name', 'player_id']]
tournaments_players
| tournament_id | team_name | player_id | |
|---|---|---|---|
| 0 | WC-1970 | Mexico | P-07042 |
| 1 | WC-1970 | Mexico | P-07391 |
| 2 | WC-1970 | Mexico | P-07599 |
| 3 | WC-1970 | Mexico | P-03090 |
| 4 | WC-1970 | Mexico | P-04789 |
| ... | ... | ... | ... |
| 20613 | WC-2022 | France | P-07581 |
| 20614 | WC-2022 | France | P-01232 |
| 20615 | WC-2022 | France | P-04666 |
| 20616 | WC-2022 | France | P-04194 |
| 20617 | WC-2022 | France | P-05115 |
20618 rows × 3 columns
players_birth_dates = players[['player_id','birth_date']]
players_birth_dates
| player_id | birth_date | |
|---|---|---|
| 0 | P-08891 | 1934-09-30 |
| 1 | P-08589 | 2000-10-22 |
| 2 | P-04897 | 1934-07-03 |
| 3 | P-05556 | 1975-06-26 |
| 4 | P-08163 | 1978-08-03 |
| ... | ... | ... |
| 8480 | P-06718 | 1976-09-12 |
| 8481 | P-03465 | 1997-09-25 |
| 8482 | P-08318 | 1986-08-18 |
| 8483 | P-05921 | 1962-12-02 |
| 8484 | P-03072 | 1944-02-24 |
8485 rows × 2 columns
tournaments_players_birth_dates = pd.merge(tournaments_players, players_birth_dates, how = 'inner', on = ['player_id'])
tournaments_winners_birth_dates = pd.merge(tournaments_players_birth_dates, tournaments_winners, how = 'inner', on = ['tournament_id', 'team_name'])
tournaments_winners_birth_dates
| tournament_id | team_name | player_id | birth_date | |
|---|---|---|---|---|
| 0 | WC-1970 | Brazil | P-08384 | 1937-12-24 |
| 1 | WC-1970 | Brazil | P-08384 | 1937-12-24 |
| 2 | WC-1970 | Brazil | P-08384 | 1937-12-24 |
| 3 | WC-1970 | Brazil | P-08384 | 1937-12-24 |
| 4 | WC-1970 | Brazil | P-08384 | 1937-12-24 |
| ... | ... | ... | ... | ... |
| 1288 | WC-2018 | France | P-02651 | 1995-11-12 |
| 1289 | WC-2018 | France | P-07333 | 1985-03-28 |
| 1290 | WC-2018 | France | P-00847 | 1992-07-29 |
| 1291 | WC-2018 | France | P-09768 | 1994-07-17 |
| 1292 | WC-2018 | France | P-00067 | 1993-01-26 |
1293 rows × 4 columns
tournaments_winners_birth_dates['tournament_id'] = tournaments_winners_birth_dates['tournament_id'].apply(lambda x: int(x[3:7]))
tournaments_winners_birth_dates['birth_date'] = tournaments_winners_birth_dates['birth_date'].apply(lambda x: int(x[0:4]))
tournaments_winners_birth_dates['age'] = tournaments_winners_birth_dates['tournament_id']-tournaments_winners_birth_dates['birth_date']
tournaments_winners_birth_dates
| tournament_id | team_name | player_id | birth_date | age | |
|---|---|---|---|---|---|
| 0 | 1970 | Brazil | P-08384 | 1937 | 33 |
| 1 | 1970 | Brazil | P-08384 | 1937 | 33 |
| 2 | 1970 | Brazil | P-08384 | 1937 | 33 |
| 3 | 1970 | Brazil | P-08384 | 1937 | 33 |
| 4 | 1970 | Brazil | P-08384 | 1937 | 33 |
| ... | ... | ... | ... | ... | ... |
| 1288 | 2018 | France | P-02651 | 1995 | 23 |
| 1289 | 2018 | France | P-07333 | 1985 | 33 |
| 1290 | 2018 | France | P-00847 | 1992 | 26 |
| 1291 | 2018 | France | P-09768 | 1994 | 24 |
| 1292 | 2018 | France | P-00067 | 1993 | 25 |
1293 rows × 5 columns
winners_mean_ages = tournaments_winners_birth_dates.groupby(['tournament_id','team_name'])['age'].agg(['median', 'mean']).reset_index()
winners_mean_ages.sort_values(by = 'mean')
| tournament_id | team_name | median | mean | |
|---|---|---|---|---|
| 2 | 1978 | Argentina | 25.0 | 25.566667 |
| 12 | 2018 | France | 25.0 | 25.855670 |
| 0 | 1970 | Brazil | 26.0 | 26.291667 |
| 11 | 2014 | Germany | 26.0 | 26.765306 |
| 8 | 2002 | Brazil | 26.0 | 26.894737 |
| 10 | 2010 | Spain | 26.0 | 26.917526 |
| 4 | 1986 | Argentina | 26.0 | 26.931034 |
| 7 | 1998 | France | 28.0 | 27.195876 |
| 1 | 1974 | West Germany | 28.0 | 27.325581 |
| 5 | 1990 | West Germany | 27.5 | 27.386364 |
| 13 | 2022 | Argentina | 28.0 | 27.495575 |
| 6 | 1994 | Brazil | 28.0 | 27.955056 |
| 3 | 1982 | Italy | 28.0 | 28.616279 |
| 9 | 2006 | Italy | 29.0 | 28.765306 |
players_goals
| player_id | goals_number | full_name | list_team_name | |
|---|---|---|---|---|
| 0 | P-00004 | 1 | Julio César Cortés | [Uruguay] |
| 1 | P-00010 | 1 | Michel De Wolf | [Belgium] |
| 2 | P-00013 | 2 | Vincenzo Iaquinta | [Italy] |
| 3 | P-00021 | 5 | Hans Krankl | [Austria] |
| 4 | P-00032 | 1 | Antonio Rada | [Colombia] |
| ... | ... | ... | ... | ... |
| 1432 | P-09960 | 3 | Peregrino Anselmo | [Uruguay] |
| 1433 | P-09962 | 1 | Alberto Górriz | [Spain] |
| 1434 | P-09968 | 3 | Nico Claesen | [Belgium] |
| 1435 | P-09981 | 2 | Siniša Mihajlović | [Yugoslavia] |
| 1436 | P-09998 | 8 | Rivaldo | [Brazil] |
1437 rows × 4 columns
goals_birth_dates = pd.merge(players_goals, players_birth_dates, how ='inner', on = 'player_id')
goals_birth_dates
| player_id | goals_number | full_name | list_team_name | birth_date | |
|---|---|---|---|---|---|
| 0 | P-00004 | 1 | Julio César Cortés | [Uruguay] | 1941-03-29 |
| 1 | P-00010 | 1 | Michel De Wolf | [Belgium] | 1958-01-19 |
| 2 | P-00013 | 2 | Vincenzo Iaquinta | [Italy] | 1979-11-21 |
| 3 | P-00021 | 5 | Hans Krankl | [Austria] | 1953-02-14 |
| 4 | P-00032 | 1 | Antonio Rada | [Colombia] | 1937-06-13 |
| ... | ... | ... | ... | ... | ... |
| 1432 | P-09960 | 3 | Peregrino Anselmo | [Uruguay] | 1902-04-30 |
| 1433 | P-09962 | 1 | Alberto Górriz | [Spain] | 1958-02-16 |
| 1434 | P-09968 | 3 | Nico Claesen | [Belgium] | 1962-10-07 |
| 1435 | P-09981 | 2 | Siniša Mihajlović | [Yugoslavia] | 1969-02-20 |
| 1436 | P-09998 | 8 | Rivaldo | [Brazil] | 1972-04-19 |
1437 rows × 5 columns
goals_birth_dates['birth_date'] = goals_birth_dates['birth_date'].apply(lambda x: x[0:4])
goals_birth_dates = goals_birth_dates.sort_values(by = 'birth_date')
goals_birth_dates
| player_id | goals_number | full_name | list_team_name | birth_date | |
|---|---|---|---|---|---|
| 5 | P-00033 | 1 | Tom Florie | [United States] | 1897 |
| 1033 | P-06875 | 1 | Héctor Scarone | [Uruguay] | 1898 |
| 712 | P-04768 | 1 | Bart McGhee | [United States] | 1899 |
| 993 | P-06624 | 5 | Pedro Cea | [Uruguay] | 1900 |
| 717 | P-04792 | 3 | Raimundo Orsi | [Italy] | 1901 |
| ... | ... | ... | ... | ... | ... |
| 559 | P-03701 | 1 | Moisés Caicedo | [Ecuador] | 2001 |
| 1193 | P-08042 | 1 | Joško Gvardiol | [Croatia] | 2002 |
| 375 | P-02527 | 1 | Jude Bellingham | [England] | 2003 |
| 1119 | P-07451 | 1 | Gavi | [Spain] | 2004 |
| 976 | P-06541 | 1 | Tomás Fernández | [Cuba] | not |
1437 rows × 5 columns
fig = px.bar(goals_birth_dates,x ='birth_date', y = 'goals_number')
fig.show()
mean_goals_per_birth_date = goals_birth_dates.groupby('birth_date')['goals_number'].agg('median').reset_index(name = 'mean_goals_number')
mean_goals_per_birth_date
| birth_date | mean_goals_number | |
|---|---|---|
| 0 | 1897 | 1.0 |
| 1 | 1898 | 1.0 |
| 2 | 1899 | 1.0 |
| 3 | 1900 | 5.0 |
| 4 | 1901 | 2.5 |
| ... | ... | ... |
| 103 | 2001 | 2.0 |
| 104 | 2002 | 1.0 |
| 105 | 2003 | 1.0 |
| 106 | 2004 | 1.0 |
| 107 | not | 1.0 |
108 rows × 2 columns
fig = px.line(mean_goals_per_birth_date,x ='birth_date', y = 'mean_goals_number')
fig.show()
players_matches = player_appearances.groupby("player_id")['match_id'].count().reset_index().rename(columns={"match_id": "matches_number"})
players_matches
| player_id | matches_number | |
|---|---|---|
| 0 | P-00001 | 14 |
| 1 | P-00003 | 3 |
| 2 | P-00004 | 6 |
| 3 | P-00005 | 4 |
| 4 | P-00009 | 1 |
| ... | ... | ... |
| 5097 | P-09994 | 2 |
| 5098 | P-09996 | 1 |
| 5099 | P-09997 | 3 |
| 5100 | P-09998 | 14 |
| 5101 | P-09999 | 2 |
5102 rows × 2 columns
goals_matches_birth_dates = pd.merge(goals_birth_dates, players_matches, how ='inner', on = 'player_id')
goals_matches_birth_dates = goals_matches_birth_dates.groupby('birth_date')['goals_number','matches_number'].agg('median').reset_index()
goals_matches_birth_dates['goals/matches'] = goals_matches_birth_dates['goals_number']/goals_matches_birth_dates['matches_number']
goals_matches_birth_dates
C:\Users\dralh\AppData\Local\Temp\ipykernel_2600\2429022078.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| birth_date | goals_number | matches_number | goals/matches | |
|---|---|---|---|---|
| 0 | 1936 | 9.0 | 6.0 | 1.500000 |
| 1 | 1937 | 4.0 | 4.0 | 1.000000 |
| 2 | 1938 | 1.0 | 1.0 | 1.000000 |
| 3 | 1939 | 1.0 | 5.0 | 0.200000 |
| 4 | 1940 | 2.0 | 6.0 | 0.333333 |
| ... | ... | ... | ... | ... |
| 64 | 2000 | 1.0 | 4.0 | 0.250000 |
| 65 | 2001 | 2.0 | 4.0 | 0.500000 |
| 66 | 2002 | 1.0 | 7.0 | 0.142857 |
| 67 | 2003 | 1.0 | 5.0 | 0.200000 |
| 68 | 2004 | 1.0 | 4.0 | 0.250000 |
69 rows × 4 columns
fig = px.line(goals_matches_birth_dates,x ='birth_date', y = 'goals_number')
fig.show()
fig = px.line(goals_matches_birth_dates,x ='birth_date', y = 'goals_number')
fig.show()
fig = px.line(goals_matches_birth_dates,x ='birth_date', y = 'goals/matches')
fig.show()
res1 = matches.groupby('tournament_id')['match_id'].count().reset_index(name = 'matches_number')
res1
| tournament_id | matches_number | |
|---|---|---|
| 0 | WC-1930 | 18 |
| 1 | WC-1934 | 17 |
| 2 | WC-1938 | 18 |
| 3 | WC-1950 | 22 |
| 4 | WC-1954 | 26 |
| 5 | WC-1958 | 35 |
| 6 | WC-1962 | 32 |
| 7 | WC-1966 | 32 |
| 8 | WC-1970 | 32 |
| 9 | WC-1974 | 38 |
| 10 | WC-1978 | 38 |
| 11 | WC-1982 | 52 |
| 12 | WC-1986 | 52 |
| 13 | WC-1990 | 52 |
| 14 | WC-1994 | 52 |
| 15 | WC-1998 | 64 |
| 16 | WC-2002 | 64 |
| 17 | WC-2006 | 64 |
| 18 | WC-2010 | 64 |
| 19 | WC-2014 | 64 |
| 20 | WC-2018 | 64 |
| 21 | WC-2022 | 64 |
res2 = squads.groupby('tournament_id')['team_id'].nunique().reset_index(name = 'teams_number')
res2
| tournament_id | teams_number | |
|---|---|---|
| 0 | WC-1930 | 13 |
| 1 | WC-1934 | 16 |
| 2 | WC-1938 | 15 |
| 3 | WC-1950 | 13 |
| 4 | WC-1954 | 16 |
| 5 | WC-1958 | 16 |
| 6 | WC-1962 | 16 |
| 7 | WC-1966 | 16 |
| 8 | WC-1970 | 16 |
| 9 | WC-1974 | 16 |
| 10 | WC-1978 | 16 |
| 11 | WC-1982 | 24 |
| 12 | WC-1986 | 24 |
| 13 | WC-1990 | 24 |
| 14 | WC-1994 | 24 |
| 15 | WC-1998 | 32 |
| 16 | WC-2002 | 32 |
| 17 | WC-2006 | 32 |
| 18 | WC-2010 | 32 |
| 19 | WC-2014 | 32 |
| 20 | WC-2018 | 32 |
| 21 | WC-2022 | 32 |